# install packages
# install.packages("plyr")
# install.packages("dplyr")
# install.packages("ggplot2")
# install.packages("ggmap")
# install.packages("RColorBrewer")
# declare libraries
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(ggmap)
library(RColorBrewer)
# read in citibike data
citiBike <- read.csv("201511-citibike-tripdata.csv")
# function that creates a vector of gradient colors based on a size input and vector of colors
gradient <- function(n, colors){
colfunc <- colorRampPalette(colors)
return(colfunc(n))
}
Key question: Which stations see the most asymmetric traffic (more arrivals than departures and vice versa)?
# Get counts of number of times each station was used as an end station and start station.
endStation<-as.data.frame(table(citiBike$end.station.id))
startStation<-as.data.frame(table(citiBike$start.station.id))
# Merge the two data frames into one
stationData <- merge(startStation, endStation, by="Var1", all=TRUE)
# Set stations that dont have a count (an NA value) to 0
stationData[is.na(stationData)] <- 0
# Rename the columns
names(stationData) <- c("station.id", "start.freq", "end.freq")
# Calculate incoming and outgoing traffic
stationData$inTraffic <- stationData$start.freq - stationData$end.freq
stationData$outTraffic <- stationData$end.freq - stationData$start.freq
# Order the stations by the most outgoing traffic and graph
outTraffic <- stationData[order(-stationData$outTraffic),]
barplot(outTraffic$outTraffic[outTraffic$outTraffic > 300],
names.arg = outTraffic$station.id[outTraffic$outTraffic > 300], ylim = c(0, 1000),
col = gradient(12, c("darkgreen", "palegreen3", "palegreen2")),
density = 90, border = TRUE, las = 2, main="Bike Stations with Highest Surplus",
xlab = "Station IDs", ylab = "Surplus of Bikes (Arrivals - Departures)",
cex.axis = 0.75, cex.names = 0.75, cex.lab = 0.9)
# Order the stations by the most incoming traffic and graph
inTraffic <- stationData[order(-stationData$inTraffic),]
barplot(inTraffic$inTraffic[inTraffic$inTraffic > 300],
names.arg = inTraffic$station.id[inTraffic$inTraffic > 300],
ylim = c(0, 2500), col = gradient(11, c("darkred", "red", "darkorange", "orange")),
density = 90, border = TRUE, las = 2, main="Bike Stations with Highest Deficit",
xlab = "Station IDs", ylab = "Deficit of Bikes (Departures - Arrivals)",
cex.axis = 0.75, cex.names = 0.75, cex.lab = 0.9)
Which stations originate the longest rides? Does this change as we go through different times of the day?
# Calculate the mean ride duration for all stations, listed them by start.station.id
longestRide <- aggregate(citiBike[, 1], list(citiBike$start.station.id), mean)
names(longestRide) <- c("station.id", "mean.trip.duration")
# Merge into dataframe from part 1
stationData <- merge(stationData, longestRide, by="station.id", all=TRUE)
stationData[is.na(stationData)] <- 0
# Order data by trip duration and graph stations with rides of average duration of over 2000
longestRide <- stationData[order(-stationData$mean.trip.duration),]
barplot(longestRide$mean.trip.duration[longestRide$mean.trip.duration > 2000], main = "Stations that Originate the Longest Rides", names.arg = longestRide$station.id[longestRide$mean.trip.duration > 2000], xlab = "Station IDs", ylab = "Ride Duration")
# Determine what time of day each ride started in: early morning, morning, afternoon, evening, or late evening
citiBike$time.slot = strtoi(format(as.POSIXct(citiBike$starttime, format = "%m/%d/%Y %H:%M:%S"), format = "%H"), base = 10L)
citiBike$time.slot.category[citiBike$time.slot < 8] = "earlyMorning"
citiBike$time.slot.category[citiBike$time.slot < 12 & citiBike$time.slot >= 8] <- "morning"
citiBike$time.slot.category[citiBike$time.slot < 16 & citiBike$time.slot >= 12] <- "afternoon"
citiBike$time.slot.category[citiBike$time.slot < 20 & citiBike$time.slot >= 16] <- "evening"
citiBike$time.slot.category[citiBike$time.slot >= 20] <- "lateEvening"
# Calculate mean duration for each station and time of day combination
longest.ride.category <- aggregate(citiBike[, 1], list(citiBike$start.station.id, citiBike$time.slot.category), mean)
names(longest.ride.category) <- c("station.id", "time.slot", "mean")
longest.ride.category <- longest.ride.category[order(-longest.ride.category$mean),]
# Create graphs for each of the time of days
early.morning.ride = longest.ride.category[longest.ride.category$time.slot == "earlyMorning",]
barplot(early.morning.ride$mean[early.morning.ride$mean > 2500], names.arg = early.morning.ride$station.id[early.morning.ride$mean > 2500], main="Stations that Originate the Longest Rides in the Early Morning", xlab = "Station IDs", ylab = "Mean Ride Duration")
morning.ride = longest.ride.category[longest.ride.category$time.slot == "morning",]
barplot(morning.ride$mean[morning.ride$mean > 2500], names.arg = morning.ride$station.id[morning.ride$mean > 2500], main="Stations that Originate the Longest Rides in the Morning", xlab = "Station IDs", ylab = "Mean Ride Duration")
afternoon.ride = longest.ride.category[longest.ride.category$time.slot == "afternoon",]
barplot(afternoon.ride$mean[afternoon.ride$mean > 3000], names.arg = afternoon.ride$station.id[afternoon.ride$mean > 3000], main="Stations that Originate the Longest Rides in the Afternoon", xlab = "Station IDs", ylab = "Mean Ride Duration")
evening.ride = longest.ride.category[longest.ride.category$time.slot == "evening",]
barplot(evening.ride$mean[evening.ride$mean > 2500], names.arg = evening.ride$station.id[evening.ride$mean > 2500], main="Stations that Originate the Longest Rides in the Evening", xlab = "Station IDs", ylab = "Mean Ride Duration")
late.evening.ride = longest.ride.category[longest.ride.category$time.slot == "lateEvening",]
barplot(late.evening.ride$mean[late.evening.ride$mean > 3000], names.arg = late.evening.ride$station.id[late.evening.ride$mean > 3000], main = "Stations that Originate the Longest Rides in the Late Evening", xlab = "Station IDs", ylab = "Mean Ride Duration")
# change
# ===============================================================================
# ===============================[calculations]==================================
# ===============================================================================
# =============================[most used routes]================================
# function returns a unique integer from two integer inputs
cantor_pairing_function <- function(a, b){
return(0.5 * (a + b) * (a + b + 1) + b)
}
# get all routes taken
routes <- subset(citiBike, select = c(start.station.id, end.station.id, start.station.latitude, start.station.longitude, end.station.latitude, end.station.longitude, usertype))
colnames(routes) <- c("start.id", "end.id", "start.lat", "start.lon", "end.lat", "end.lon", "user")
# apply unique route.id based on the cantor pairing function, with start.id and end.id as inputs
routes$route.id = cantor_pairing_function(routes$start.id, routes$end.id)
# seperate by users
subscriber_routes <- subset(routes, user == "Subscriber")
customer_routes <- subset(routes, user == "Customer")
routes <- subset(routes, select = -c(user))
# count # of times a route has been used
route_counts <- routes %>%
count(route.id)
colnames(route_counts) <- c("route.id", "route.count")
# get only unique routes
unique_routes <- unique(routes)
# merge route count with ids, lats, and longitudes
route_counts <- join(unique_routes, route_counts, by = "route.id", type = "full", match = "first")
route_counts <- subset(route_counts, select = -c(route.id))
# sort by highest count
route_counts <- route_counts[with(route_counts, order(-route.count)), ]
# =============================[route usage by user]============================
# count # of times a route has been used for subscribers
subscriber_routes <- subscriber_routes %>%
count(route.id)
colnames(subscriber_routes) <- c("route.id", "route.count")
subscriber_routes <- join(unique_routes, subscriber_routes, by = "route.id", type = "full", match = "first")
subscriber_routes <- subset(subscriber_routes, select = -c(route.id))
subscriber_routes <- subscriber_routes[with(subscriber_routes, order(-route.count)), ]
# count # of times a route has been used for customers
customer_routes <- customer_routes %>%
count(route.id)
colnames(customer_routes) <- c("route.id", "route.count")
customer_routes <- join(unique_routes, customer_routes, by = "route.id", type = "full", match = "first")
customer_routes <- subset(customer_routes, select = -c(route.id))
customer_routes <- customer_routes[with(customer_routes, order(-route.count)), ]
# =============================[station surplus]================================
# get arrivals to station
arrival_counts <- citiBike %>%
count(end.station.id)
colnames(arrival_counts) <- c("id", "arr")
# get departures from station
departure_counts <- citiBike %>%
count(start.station.id)
colnames(departure_counts) <- c("id", "dep")
# get unique stations
stations <- unique(subset(citiBike, select = c(start.station.id, start.station.latitude, start.station.longitude)))
colnames(stations) <- c("id", "lat", "lon")
# merge arrivals and departures with list of stations
stations <- join(stations, arrival_counts, by = "id", type = "full", match = "first")
stations <- join(stations, departure_counts, by = "id", type = "full", match = "first")
# calculate surplus of station
stations$surplus = stations$arr - stations$dep
# seperate surplus and deficit stations
stations.surplus <- subset(stations, surplus >= 0)
stations.deficit <- subset(stations, surplus <= 0)
# ===============================================================================
# ==================================[mapping]====================================
# ===============================================================================
min_lat <- min(c(min(citiBike$start.station.latitude), min(citiBike$end.station.latitude))) + 0.025
max_lat <- max(c(max(citiBike$start.station.latitude), max(citiBike$end.station.latitude))) + 0.0055
min_lon <- min(c(min(citiBike$start.station.longitude), min(citiBike$end.station.longitude))) + 0.029
max_lon <- max(c(max(citiBike$start.station.longitude), max(citiBike$end.station.longitude))) + 0.005
nyc <- c(left = min_lon, bottom = min_lat, right = max_lon, top = max_lat)
basemap <- get_map(nyc, zoom = 13, maptype = "terrain", source = "stamen")
## Map from URL : http://tile.stamen.com/terrain/13/2411/3077.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3077.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3077.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3078.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3078.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3078.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3079.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3079.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3079.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3080.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3080.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3080.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2411/3081.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2412/3081.jpg
## Map from URL : http://tile.stamen.com/terrain/13/2413/3081.jpg
# =============================[most used routes]================================
# declare variables
elements <- 20
origin <- c()
dest <- c()
# get routes
for(i in 1:elements){
origin <- append(origin, paste(route_counts$start.lat[i], route_counts$start.lon[i], sep=","))
dest <- append(dest, paste(route_counts$end.lat[i], route_counts$end.lon[i], sep=","))
}
# initialize routes map
map.routes <- ggmap(basemap, base_layer = ggplot(stations, aes(x = lon, y = lat)))
# build routes map
for(i in 1:(elements)){
k <- (i - 1) / elements
map.routes <- map.routes +
geom_path(aes(x = lon, y = lat), data = route(origin[i], dest[i], structure = "route"),
color = "darkred", size = 2 - (k * (2 - 0.5)), alpha = 1 - (k * (1 - 0.5)))
# to deal with Google API request limit
Sys.sleep(0.5)
}
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76590936,-73.97634151&destination=40.76590936,-73.97634151&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74173969,-73.99415556&destination=40.7454973,-74.00197139&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76590936,-73.97634151&destination=40.7768286343997,-73.9638876914978&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71273266,-74.0046073&destination=40.71273266,-74.0046073&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74173969,-73.99415556&destination=40.74691959,-74.00451887&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71921585,-73.9424469&destination=40.715143,-73.944507&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74232744,-73.95411749&destination=40.74731,-73.95451&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.751873,-73.977706&destination=40.74096374,-73.98602213&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7643971,-73.97371465&destination=40.7643971,-73.97371465&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7612274,-73.96094022&destination=40.76500525,-73.95818491&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.77536905,-73.94803392&destination=40.77801203,-73.95407149&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.715143,-73.944507&destination=40.71921585,-73.9424469&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76590936,-73.97634151&destination=40.78472675,-73.96961715&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.73028666,-73.9907647&destination=40.72740794,-73.98142006&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.751873,-73.977706&destination=40.75640548,-73.9900262&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.711512,-74.015756&destination=40.701907,-74.013942&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.751873,-73.977706&destination=40.75019995,-73.99093085&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76500525,-73.95818491&destination=40.76440023,-73.96648977&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.72456089,-73.99565293&destination=40.73028666,-73.9907647&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74734825,-73.99723551&destination=40.75066386,-74.00176802&mode=driving&units=metric&alternatives=false&sensor=false
# display most used routes map
map.routes
# =============================[most used routes by subscribers]================================
# declare variables
elements <- 20
origin <- c()
dest <- c()
# get routes
for(i in 1:elements){
origin <- append(origin, paste(subscriber_routes$start.lat[i], subscriber_routes$start.lon[i], sep=","))
dest <- append(dest, paste(subscriber_routes$end.lat[i], subscriber_routes$end.lon[i], sep=","))
}
# initialize routes map
map.routes.subscribers <- ggmap(basemap, base_layer = ggplot(stations, aes(x = lon, y = lat)))
# build routes map
for(i in 1:(elements)){
k <- (i - 1) / elements
map.routes.subscribers <- map.routes.subscribers +
geom_path(aes(x = lon, y = lat), data = route(origin[i], dest[i], structure = "route"),
color = "darkred", size = 2 - (k * (2 - 0.5)), alpha = 1 - (k * (1 - 0.5)))
# to deal with Google API request limit
Sys.sleep(0.5)
}
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74173969,-73.99415556&destination=40.7454973,-74.00197139&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71921585,-73.9424469&destination=40.715143,-73.944507&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74173969,-73.99415556&destination=40.74691959,-74.00451887&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74232744,-73.95411749&destination=40.74731,-73.95451&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.751873,-73.977706&destination=40.74096374,-73.98602213&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7612274,-73.96094022&destination=40.76500525,-73.95818491&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.77536905,-73.94803392&destination=40.77801203,-73.95407149&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.715143,-73.944507&destination=40.71921585,-73.9424469&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.751873,-73.977706&destination=40.75640548,-73.9900262&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.73028666,-73.9907647&destination=40.72740794,-73.98142006&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.751873,-73.977706&destination=40.75019995,-73.99093085&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74734825,-73.99723551&destination=40.75066386,-74.00176802&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76500525,-73.95818491&destination=40.76440023,-73.96648977&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.74731,-73.95451&destination=40.74232744,-73.95411749&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71745169,-73.95850939&destination=40.716887,-73.963198&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.70834698,-74.01713445&destination=40.7153379,-74.01658354&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.72456089,-73.99565293&destination=40.73028666,-73.9907647&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.72740794,-73.98142006&destination=40.73028666,-73.9907647&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7153379,-74.01658354&destination=40.72043411,-74.01020609&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76312584,-73.96526895&destination=40.76500525,-73.95818491&mode=driving&units=metric&alternatives=false&sensor=false
# display most used routes map
map.routes.subscribers
# =============================[most used routes by customers]================================
# declare variables
elements <- 20
origin <- c()
dest <- c()
# get routes
for(i in 1:elements){
origin <- append(origin, paste(customer_routes$start.lat[i], customer_routes$start.lon[i], sep=","))
dest <- append(dest, paste(customer_routes$end.lat[i], customer_routes$end.lon[i], sep=","))
}
# initialize routes map
map.routes.customers <- ggmap(basemap, base_layer = ggplot(stations, aes(x = lon, y = lat)))
# build routes map
for(i in 1:(elements)){
k <- (i - 1) / elements
map.routes.customers <- map.routes.customers +
geom_path(aes(x = lon, y = lat), data = route(origin[i], dest[i], structure = "route"),
color = "darkred", size = 2 - (k * (2 - 0.5)), alpha = 1 - (k * (1 - 0.5)))
# to deal with Google API request limit
Sys.sleep(0.5)
}
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76590936,-73.97634151&destination=40.76590936,-73.97634151&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76590936,-73.97634151&destination=40.7768286343997,-73.9638876914978&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71273266,-74.0046073&destination=40.71273266,-74.0046073&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7643971,-73.97371465&destination=40.7643971,-73.97371465&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76590936,-73.97634151&destination=40.78472675,-73.96961715&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76590936,-73.97634151&destination=40.77282817,-73.96685276&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7768286343997,-73.9638876914978&destination=40.7768286343997,-73.9638876914978&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.70277159,-73.99383605&destination=40.71273266,-74.0046073&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7768286343997,-73.9638876914978&destination=40.78472675,-73.96961715&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7768286343997,-73.9638876914978&destination=40.76590936,-73.97634151&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7643971,-73.97371465&destination=40.7768286343997,-73.9638876914978&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76915505,-73.98191841&destination=40.76915505,-73.98191841&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.71273266,-74.0046073&destination=40.69597683,-73.99014892&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.76087502,-74.00277668&destination=40.71754834,-74.01322069&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.77282817,-73.96685276&destination=40.77282817,-73.96685276&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.77282817,-73.96685276&destination=40.78472675,-73.96961715&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.78472675,-73.96961715&destination=40.78472675,-73.96961715&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.77282817,-73.96685276&destination=40.7768286343997,-73.9638876914978&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.7768286343997,-73.9638876914978&destination=40.7643971,-73.97371465&mode=driving&units=metric&alternatives=false&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/directions/json?origin=40.69991755,-73.98971773&destination=40.71273266,-74.0046073&mode=driving&units=metric&alternatives=false&sensor=false
# display most used routes map
map.routes.customers
# =============================[station surplus]================================
# build surplus stations map
map.surplus <- ggmap(basemap, base_layer = ggplot(stations.surplus, aes(x = lon, y = lat))) +
geom_point(aes(color = surplus, size = surplus), alpha = .8) +
scale_color_gradient(low = "green", high = "darkgreen") +
scale_size(range = c(0.5, 10))
# build deficit stations map
map.deficit <- ggmap(basemap, base_layer = ggplot(stations.deficit, aes(x = lon, y = lat))) +
geom_point(aes(color = surplus, size = surplus), alpha = .8) +
scale_color_gradient(low = "darkred", high = "red") +
scale_size(range = c(30, 1))
# display surplus and deficit maps
map.surplus
map.deficit
# The stations that are start stations the most number of times are likely the best candidates for a higher bike storage capacity
orderedByMostIncomingTraffic <- stationData[order(stationData$outTraffic),]
head(orderedByMostIncomingTraffic)
## station.id start.freq end.freq inTraffic outTraffic mean.trip.duration
## 460 3230 5634 3164 2470 -2470 912.0309
## 466 3236 2340 1134 1206 -1206 1413.0620
## 286 519 11267 10232 1035 -1035 916.9943
## 284 517 5008 4168 840 -840 828.6024
## 278 511 4603 3978 625 -625 805.1271
## 168 391 1640 1182 458 -458 1724.0012
# Find number of unique bike ids
bikeCapacity <- floor(nrow(as.data.frame(table(citiBike$bikeid))) / nrow(stationData))
orderedByMostIncomingTraffic <- stationData[order(stationData$outTraffic),]
# Show station that had most bikes leave than arrive; show station that had most bikes arrive than leave. Due to memory restraints, we cannot show all stations. If we could, we would iterate through every station in orderedByMostIncomingTraffic and show the plot for each station.
stId <- 1
for(i in 1:2)
{
stationId <- orderedByMostIncomingTraffic[stId,1]
stationInvolved <- citiBike[(citiBike$start.station.id == stationId | citiBike$end.station.id == stationId),]
numDoubleStations <- nrow(citiBike[(citiBike$start.station.id == stationId & citiBike$end.station.id == stationId),])
# vectors to hold capacity at a certain time
timeV <- character(nrow(stationInvolved) + numDoubleStations)
capacityV <- integer(nrow(stationInvolved) + numDoubleStations)
# initial values
timeV[1] <- "11/1/2015 00:00:00"
capacityV[1] <- bikeCapacity
timeVIndex <- 2
capacityVIndex <- 2
currCapacity = bikeCapacity
# iterate through everytime the station is either a start or end station
for(j in 1:nrow(stationInvolved))
{
currRow <- stationInvolved[j,]
if(currRow$start.station.id == stationId)
{
currCapacity <- currCapacity - 1
timeV[timeVIndex] <- toString(currRow$starttime)
capacityV[capacityVIndex] <- currCapacity
timeVIndex <- timeVIndex + 1
capacityVIndex <- capacityVIndex + 1
}
if(currRow$end.station.id == stationId)
{
currCapacity <- currCapacity + 1
timeV[timeVIndex] <- toString(currRow$stoptime)
capacityV[capacityVIndex] <- currCapacity
timeVIndex <- timeVIndex + 1
capacityVIndex <- capacityVIndex + 1
}
}
timeVsCapacity <- data.frame(timeV, capacityV)
colnames(timeVsCapacity) = c("Time", "Capacity")
timeVsCapacity$Time <- factor(timeVsCapacity$Time, levels=unique(as.character(timeVsCapacity$Time)))
plot(timeVsCapacity$Time, timeVsCapacity$Capacity, main=stationId, xlab="Time", ylab="Number of bikes at station", type="n")
lines(timeVsCapacity$Time, timeVsCapacity$Capacity, type="o")
stId <- 475
}
First failed approach: tried to get information on frequency that each Bike ID is used as a way to gague demand. I realized though that this wouldn’t lead to any substantive recommendations because there would still be lots of unanswered questions: for example, one bike may be rented a lot but the trip duration for each ride is extremely short. So the code is below but my second attempt centers around trip duration which felt more acurrate.
if (FALSE) {
# Condensed data frame of just bikeID and frequency it's used
demand <- citiBike %>%
group_by(bikeid) %>%
select(bikeid) %>%
summarise(
count = count(citiBike, "bikeid")
)
#rideTime = sum(citiBike$tripduration, na.rm = TRUE) THIS LINE IS GIVING ME ISSUES
# trying to sort the data
# don't think this is working though
sortedDemand <- demand[order(-demand$count),]
head(sortedDemand)
# plotting to see frequnecy patters
plot(sortedDemand) # looks the same as below
plot(demand, main = "Rental Frequencies by Bike ID", xlab = "Bike ID", ylab = "Frequency of Renting")
# this shows that the bikes within the 22000 - 24000 bike IDs have higher frequency of usage so yes it looks like there perhaps are some bikes that are used more significantly than others
max(citiBike$bikeid)
min(citiBike$bikeid)
# range of bike IDs is from 14529 - 24765
}
Second attempt approach: I felt that this led to more tangible recommendations and less unanswered questions. My approach was to sum up trip durations by bike ID, leading to a list of bikes that are clearly more used than others. My graph features a blue line indicating the average trip duration and the red line indicates median trip duration. These two lines are relatively close to each other.
# summing trip durations and grouping by BikeID
tripSum <- tapply(citiBike$tripduration, citiBike$bikeid, sum)
head(tripSum)
## 14529 14530 14531 14532 14533 14534
## 89537 31966 34212 39165 133242 49372
tripDurations <- as.data.frame(as.table(tripSum))
names(tripDurations) <- c("bikeID", "minutes")
#sorted by most minutes on bike
tripDurations <- tripDurations[order(-tripDurations$minutes),]
# visualizing the information
barplot(height = tripDurations[1:100,2],xlab = "Bike ID", ylab = "Total Time on Bike by minutes", main = "Top 200 Bike Riding Time Totals")
# Blue Line Indicates Averages
abline(h = mean(tripDurations$minutes, na.rm = TRUE), col = "blue")
# Red Line Indicates Median
abline(h = median(tripDurations$minutes, na.rm = TRUE), col = "red")
##### Clearly, of the tens of thousands of CitiBikes, there is an upper echelon of bikes that have significantly higher total ride times. They have been identified by bikeID in the data frame “tripDurations”. Our recommendation would be to target those top 100-200 bikes with more frequent maintenance in order to overall drive down unnecessary follow up maintenance costs.
#Filter users born after 1991 into a separate data frame
young.customers <- subset(citiBike, citiBike$birth.year > 1991)
young.startStation <- as.data.frame(table(young.customers$start.station.id))
#Sort data frame of frequency of use of certain bike stations in increasing order:
sorted.young <- young.startStation[order(young.startStation$Freq),]
colnames(sorted.young) <- c("Start Station ID","# of times used")
print(head(sorted.young, n=10),row.names=FALSE)
## Start Station ID # of times used
## 3052 1
## 3127 1
## 3221 1
## 418 2
## 3044 2
## 3089 2
## 3117 2
## 233 3
## 436 3
## 3043 3
#Filter users that are customers and not subscribers into a separate data frame
tourist.customers <- subset(citiBike,citiBike$usertype == "Customer")
tourist.startStation <- as.data.frame(table(tourist.customers$start.station.id))
#Sort data frame of frequency of use of certain bike stations in increasing order:
sorted.tourist <- tourist.startStation[order(tourist.startStation$Freq),]
colnames(sorted.tourist) <- c("Start Station ID","# of times used")
print(head(sorted.tourist, n=10),row.names=FALSE)
## Start Station ID # of times used
## 2005 1
## 3054 2
## 218 5
## 3059 5
## 3049 8
## 3221 8
## 3070 10
## 3048 11
## 3053 11
## 3061 11
Key question: Is there a difference in peak bike usage on weekends and weekdays?
# Separate the bike data into weekends versus weekdays
citiBike$day = format(as.POSIXct(citiBike$starttime, format = "%m/%d/%Y %H:%M:%S"), format = "%u")
citiBikeWeekday <- citiBike[citiBike$day < 6,]
citiBikeWeekend <- citiBike[citiBike$day >= 6,]
citiBikeWeekday$hour = strtoi(format(as.POSIXct(citiBikeWeekday$starttime, format = "%m/%d/%Y %H:%M:%S"), format = "%H"), base = 10L)
citiBikeWeekend$hour = strtoi(format(as.POSIXct(citiBikeWeekend$starttime, format = "%m/%d/%Y %H:%M:%S"), format = "%H"), base = 10L)
# Get the counts of number of rides for each hour
weekdayStart <- as.data.frame(table(citiBikeWeekday$hour))
weekendStart <- as.data.frame(table(citiBikeWeekend$hour))
# Plot
barplot(weekdayStart$Freq, names.arg = weekdayStart$Var1, main = "Weekday CitiBike Usage by Time", xlab = "Time (Hour)", ylab = "Number of Users")
barplot(weekendStart$Freq, names.arg = weekendStart$Var1, main = "Weekend CitiBike Usage by Time", xlab = "Time (Hour)", ylab = "Number of Users")
Additional analysis: Genders within User Types
Key question: Is there a big shift within the gender of both our customers and our subscribers?
genders <- citiBike %>%
group_by(usertype) %>%
select(gender)
genderTable <- table(genders)
# 1 is Male
# 2 is Female
# We see that that there is no gender information for customers, so the barplots will illustrate gender
# discrepencies within Subscribers.
labels = c("unidentified", "male", "female")
barplot(genderTable, main = "Breakdown of Genders within Subscribers", xlab = "Gender", ylab = "Frequency", names.arg = labels)